# ---- Session reset -------------------------------------------------------
# NOTE(review): rm(list = ls()) in a script is discouraged (it only clears the
# global env, not loaded packages/options); prefer a fresh R session. Kept to
# preserve the original workflow.
rm(list = ls())
cat("\014")  # clears the RStudio console (form feed)

# ---- Dependencies --------------------------------------------------------
# Install (if missing) and attach every package used below.  One loop replaces
# thirteen copies of the same if(!require(...)) bootstrap.
packages <- c(
  "tidytext", "dplyr", "stringr", "tm", "tidyr", "gutenbergr",
  "scales", "ggplot2", "wordcloud", "reshape2", "igraph", "ggraph", "widyr"
)
for (pkg in packages) {
  # character.only = TRUE makes require()/library() accept the package name
  # held in a variable instead of a bare symbol
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}
# Para o trabalho foram escolhidos textos relativos à Segunda Guerra Mundial.
# (Translation: texts relating to the Second World War were chosen for this
# assignment.  NOTE(review): this line was bare prose -- a syntax error in an
# R script -- and has been commented out; also, the five books downloaded
# below appear to be First World War memoirs -- confirm the intended period.)
# Download the three corpora from Project Gutenberg, keeping each book's title
# as metadata.  Requires network access; gutenbergr selects a mirror on first
# use (transcript below).
France <- gutenberg_download(c(17813, 16437, 45542, 18483,9975), meta_fields = "title")
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
Dardanelles <- gutenberg_download(c(11008, 11513, 15896, 3306, 35119), meta_fields = "title")
# Book 53482 was dropped from the Russia corpus; original call kept below.
#Russia <- gutenberg_download(c(46191, 10967, 22523, 10972, 53482), meta_fields = "title")
Russia <- gutenberg_download(c(46191, 10967, 22523, 10972), meta_fields = "title")
# Add a within-book line counter to a downloaded corpus.
#
# `linenumber` restarts at 1 for every gutenberg_id and is used later to
# bucket lines into fixed-size sections.  One helper replaces three copies of
# the same pipeline.
#
# books: data frame from gutenberg_download() (gutenberg_id, text, title)
# returns the same data frame plus a `linenumber` column, ungrouped
number_lines <- function(books) {
  books %>%
    group_by(gutenberg_id) %>%
    mutate(linenumber = row_number()) %>%
    ungroup()
}

france_books <- number_lines(France)
dardanelles_books <- number_lines(Dardanelles)
russia_books <- number_lines(Russia)
# Tokenize a corpus into one lower-cased word per row, strip each token to its
# leading [a-z'] run, and remove stop words.  Tokens with no alphabetic run
# become NA in str_extract() and are removed by drop_na().  One helper
# replaces three copies of the same pipeline.
#
# books: data frame with a `text` column (plus gutenberg_id/title/linenumber)
# returns a tidy one-word-per-row data frame
tidy_words <- function(books) {
  books %>%
    unnest_tokens(input = text,
                  output = "word",
                  to_lower = TRUE,
                  drop = TRUE) %>%
    mutate(word = str_extract(word, "[a-z']+")) %>%
    anti_join(stop_words, by = c("word" = "word")) %>%
    drop_na()
}

france_tidy <- tidy_words(france_books)
dardanelles_tidy <- tidy_words(dardanelles_books)
russia_tidy <- tidy_words(russia_books)
# Ten most frequent (non-stop) words per corpus; top_n(10) keeps the ten
# largest counts, i.e. the head of the sorted table.
france_tidy %>%
  count(word, sort = TRUE) %>%
  top_n(10)
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 french 599
## 2 war 589
## 3 day 518
## 4 german 441
## 5 time 422
## 6 paris 396
## 7 france 386
## 8 captain 248
## 9 germans 237
## 10 left 222
dardanelles_tidy %>%
  count(word, sort = TRUE) %>%
  top_n(10)
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 ken 699
## 2 time 444
## 3 roy 373
## 4 day 312
## 5 night 294
## 6 war 259
## 7 left 246
## 8 water 244
## 9 turks 239
## 10 front 217
russia_tidy %>%
  count(word, sort = TRUE) %>%
  top_n(10)
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 russian 965
## 2 american 798
## 3 british 568
## 4 russia 520
## 5 company 480
## 6 archangel 457
## 7 time 434
## 8 front 415
## 9 soldiers 383
## 10 day 382
# Frequency as per the book
#frequency <- bind_rows(mutate(france_tidy, subject = "France"),
# mutate(dardanelles_tidy, subject = "Dardanelles"),
# mutate(russia_tidy, subject = "Russia")) %>%
# mutate(word = str_extract(word, "[a-z']+")) %>%
# count(subject, word) %>%
# group_by(subject) %>%
# mutate(proportion = n / sum(n)) %>%
# spread(subject, proportion) %>%
# select(-n) %>%
# gather(subject, proportion,`Personal Narratives`:`WWII Fiction`)
# NOTE(review): the commented-out one-shot pipeline above gathers columns
# `Personal Narratives`:`WWII Fiction`, which do not match the subject labels
# created below ("France", "Dardanelles", "Russia") -- it would fail if
# uncommented.
# Frequency step-by-step
# Stack the three tidied corpora, label each row with its subject, and
# compute each word's within-subject relative frequency.
binded_texts <- bind_rows(mutate(france_tidy, subject = "France"),
mutate(dardanelles_tidy, subject = "Dardanelles"),
mutate(russia_tidy, subject = "Russia"))
counted_texts <- count(binded_texts, subject, word)
grouped_texts <- group_by(counted_texts,subject)
grouped_texts_by_proportion <- mutate(grouped_texts, proportion = n / sum(n))
selected_texts <- select(grouped_texts_by_proportion,-n)
# spread then gather: one column per subject, then melt Dardanelles and France
# back into (subject, proportion) pairs, leaving Russia as its own column to
# serve as the y-axis reference in the plot below.
spreaded_text <- spread(selected_texts, subject, proportion)
frequency <- gather(spreaded_text,subject, proportion,`Dardanelles`:`France`)
# Scatter each subject's word proportions against Russia's on log-log axes;
# words near the dashed identity line are used at similar rates in both.
ggplot(frequency, aes(x = proportion, y = `Russia`, color = abs(`Russia` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~subject, ncol = 2) +
theme(legend.position="none") +
labs(y = "Russia", x = NULL)
# Pearson correlation of word proportions against Russia (transcripts below):
# France correlates somewhat more strongly with Russia than Dardanelles does.
cor.test(data = frequency[frequency$subject == "Dardanelles",],
~ proportion + `Russia`)
##
## Pearson's product-moment correlation
##
## data: proportion and Russia
## t = 51.042, df = 7470, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4915007 0.5251281
## sample estimates:
## cor
## 0.5085082
cor.test(data = frequency[frequency$subject == "France",],
~ proportion + `Russia`)
##
## Pearson's product-moment correlation
##
## data: proportion and Russia
## t = 65.76, df = 7805, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5826256 0.6111792
## sample estimates:
## cor
## 0.5970915
# Net bing sentiment per book per 80-line chunk: count positive/negative words
# in each chunk, spread to columns, and take (positive - negative).  One
# helper replaces three copies of the same pipeline.
#
# tidy_words: one-word-per-row table with `title` and `linenumber` columns
# returns one row per (title, index) with a net `sentiment` column
bing_sentiment <- function(tidy_words) {
  tidy_words %>%
    inner_join(get_sentiments("bing")) %>%
    count(title, index = linenumber %/% 80, sentiment) %>%
    spread(sentiment, n, fill = 0) %>%
    mutate(sentiment = positive - negative)
}

france_sentiment <- bing_sentiment(france_tidy)
ggplot(france_sentiment, aes(index, sentiment, fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~title, ncol = 2, scales = "free_x")

dardanelles_sentiment <- bing_sentiment(dardanelles_tidy)
ggplot(dardanelles_sentiment, aes(index, sentiment, fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~title, ncol = 2, scales = "free_x")

russia_sentiment <- bing_sentiment(russia_tidy)
ggplot(russia_sentiment, aes(index, sentiment, fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~title, ncol = 2, scales = "free_x")
# Word cloud of the 100 most frequent words per corpus.
# NOTE: a second anti_join(stop_words) was removed here -- the *_tidy tables
# were already stop-word filtered (and NA-dropped) when they were tokenized
# above, so the extra join only repeated work and printed a join message.
france_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
dardanelles_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
russia_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
# Comparison clouds: positive vs negative (bing) word frequencies per corpus.
# acast() builds the word x sentiment count matrix comparison.cloud() expects.
france_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 50)
dardanelles_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 50)
russia_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 50)
# Per-title word counts for the term-frequency histograms.  One helper
# replaces three copies of the same tokenize-filter-count pipeline.  Unlike
# the *_tidy tables, NA tokens are kept here (no drop_na); they are removed
# later, before the rank analysis.
#
# books: corpus data frame with `text` and `title` columns
# returns one row per (title, word) with count `n`, sorted by n descending
count_book_words <- function(books) {
  books %>%
    unnest_tokens(input = text,
                  output = "word",
                  to_lower = TRUE,
                  drop = TRUE) %>%
    mutate(word = str_extract(word, "[a-z']+")) %>%
    anti_join(stop_words, by = c("word" = "word")) %>%
    count(title, word, sort = TRUE) %>%
    ungroup()
}

# For each region: attach the per-title total word count, then plot the
# distribution of term frequency n/total per book.
france_book_words <- count_book_words(france_books)
france_total_words <- france_book_words %>%
  group_by(title) %>%
  summarize(total = sum(n))
france_book_words <- left_join(france_book_words, france_total_words)
ggplot(france_book_words, aes(n/total, fill = title)) +
  geom_histogram(show.legend = FALSE, bins = 25) +
  xlim(NA, 0.0009) +
  facet_wrap(~title, ncol = 2, scales = "free_y")

dardanelles_book_words <- count_book_words(dardanelles_books)
dardanelles_total_words <- dardanelles_book_words %>%
  group_by(title) %>%
  summarize(total = sum(n))
dardanelles_book_words <- left_join(dardanelles_book_words, dardanelles_total_words)
ggplot(dardanelles_book_words, aes(n/total, fill = title)) +
  geom_histogram(show.legend = FALSE, bins = 25) +
  xlim(NA, 0.0009) +
  facet_wrap(~title, ncol = 2, scales = "free_y")

russia_book_words <- count_book_words(russia_books)
russia_total_words <- russia_book_words %>%
  group_by(title) %>%
  summarize(total = sum(n))
russia_book_words <- left_join(russia_book_words, russia_total_words)
ggplot(russia_book_words, aes(n/total, fill = title)) +
  geom_histogram(show.legend = FALSE, bins = 25) +
  xlim(NA, 0.0009) +
  facet_wrap(~title, ncol = 2, scales = "free_y")
# Zipf's law: term frequency vs frequency rank per book.  Rows are already
# sorted by n descending, so row_number() within a title is the rank.  One
# helper replaces three copies of the same pipeline.
#
# book_words: per-(title, word) counts with a `total` column
# returns the same table, grouped by title, with `rank` and `term frequency`
rank_words <- function(book_words) {
  book_words %>%
    drop_na() %>%
    group_by(title) %>%
    mutate(rank = row_number(),
           `term frequency` = n / total)
}

# For each region: fit a power law on the mid-rank section (10 < rank < 500)
# and overlay the fitted line on the log-log rank/frequency curves.
france_freq_by_rank <- rank_words(france_book_words)
france_rank_subset <- france_freq_by_rank %>%
  filter(rank < 500,
         rank > 10)
france_lm <- lm(log10(`term frequency`) ~ log10(rank), data = france_rank_subset)
france_freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = title)) +
  geom_abline(intercept = france_lm$coefficients[1], slope = france_lm$coefficients[2], color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

dardanelles_freq_by_rank <- rank_words(dardanelles_book_words)
dardanelles_rank_subset <- dardanelles_freq_by_rank %>%
  filter(rank < 500,
         rank > 10)
dardanelles_lm <- lm(log10(`term frequency`) ~ log10(rank), data = dardanelles_rank_subset)
dardanelles_freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = title)) +
  geom_abline(intercept = dardanelles_lm$coefficients[1], slope = dardanelles_lm$coefficients[2], color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

russia_freq_by_rank <- rank_words(russia_book_words)
russia_rank_subset <- russia_freq_by_rank %>%
  filter(rank < 500,
         rank > 10)
russia_lm <- lm(log10(`term frequency`) ~ log10(rank), data = russia_rank_subset)
russia_freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = title)) +
  geom_abline(intercept = russia_lm$coefficients[1], slope = russia_lm$coefficients[2], color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
# tf-idf per (title, word) within each region: words that characterise one
# book relative to the others in the same set (character names, place names).
france_book_words <- france_book_words %>%
bind_tf_idf(word, title, n)
france_book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 25,599 x 6
## title word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 "The Children of France\r\nA Book … lad 63 0.00674 1.61 0.0108
## 2 "The Children of France\r\nA Book … remi 45 0.00481 1.61 0.00775
## 3 At Ypres with Best-Dunkley dunkley 90 0.00458 1.61 0.00736
## 4 "The Children of France\r\nA Book … mattia 30 0.00321 1.61 0.00517
## 5 At Ypres with Best-Dunkley andrews 63 0.00320 1.61 0.00515
## 6 "The Children of France\r\nA Book … marie 50 0.00535 0.916 0.00490
## 7 At Ypres with Best-Dunkley platoon 96 0.00488 0.916 0.00447
## 8 "Paris War Days\nDiary of an Ameri… centig… 48 0.00264 1.61 0.00425
## 9 "The Children of France\r\nA Book … prussi… 43 0.00460 0.916 0.00422
## 10 "Paris War Days\nDiary of an Ameri… paris 337 0.0185 0.223 0.00413
## # ... with 25,589 more rows
# Top-5 tf-idf words per title; top_n(5) with no column selects the last
# column (tf_idf), as the "Selecting by tf_idf" message below confirms.
france_book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(title) %>%
top_n(5) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = title)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~title, ncol = 2, scales = "free") +
coord_flip()
dardanelles_book_words <- dardanelles_book_words %>%
bind_tf_idf(word, title, n)
dardanelles_book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 25,365 x 6
## title word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 On Land and Sea at the Dardanell… ken 698 0.0353 0.916 0.0324
## 2 On Land and Sea at the Dardanell… roy 373 0.0189 1.61 0.0304
## 3 "Five Months at Anzac\r\nA Narra… postage 129 0.0122 0.916 0.0112
## 4 On Land and Sea at the Dardanell… dave 82 0.00415 1.61 0.00668
## 5 "At Suvla Bay\r\nBeing the notes… hawk 73 0.00656 0.916 0.00601
## 6 On Land and Sea at the Dardanell… ken's 73 0.00369 1.61 0.00594
## 7 On Land and Sea at the Dardanell… carringt… 60 0.00304 1.61 0.00489
## 8 "Five Months at Anzac\r\nA Narra… revised 31 0.00294 1.61 0.00473
## 9 "Trenching at Gallipoli\r\nThe p… dugout 40 0.00264 1.61 0.00426
## 10 "Trenching at Gallipoli\r\nThe p… newfound… 40 0.00264 1.61 0.00426
## # ... with 25,355 more rows
dardanelles_book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(title) %>%
top_n(5) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = title)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~title, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
russia_book_words <- russia_book_words %>%
bind_tf_idf(word, title, n)
russia_book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 22,784 x 6
## title word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 "With the \"Die-Hards\" in Siberia" omsk 116 0.00411 1.39 0.00570
## 2 "The History of the American Exped… pvt 228 0.00356 1.39 0.00494
## 3 "With the \"Die-Hards\" in Siberia" czech 92 0.00326 1.39 0.00452
## 4 "With the \"Die-Hards\" in Siberia" japane… 174 0.00617 0.693 0.00427
## 5 Fighting Without a War: An Account… british 111 0.0147 0.288 0.00421
## 6 "With the \"Die-Hards\" in Siberia" koltch… 77 0.00273 1.39 0.00378
## 7 Fighting Without a War: An Account… russia 89 0.0117 0.288 0.00338
## 8 Fighting Without a War: An Account… bolshe… 85 0.0112 0.288 0.00323
## 9 Four Weeks in the Trenches: The Wa… trench 22 0.00441 0.693 0.00305
## 10 Fighting Without a War: An Account… kitsa 28 0.00370 0.693 0.00256
## # ... with 22,774 more rows
russia_book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(title) %>%
top_n(5) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = title)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~title, ncol = 2, scales = "free") +
coord_flip()
# Bigram analysis per region: tokenize into 2-word sequences, drop bigrams
# containing stop words, compute bigram tf-idf per title, and draw a network
# of the most frequent pairs.  NOTE(review): the <NA> bigram at the top of
# each transcript presumably comes from lines too short to yield a bigram --
# it is removed with drop_na() before graphing; confirm against the
# tokenizers package behaviour.
france_bigrams <- france_books %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
france_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 97,692 x 2
## bigram n
## <chr> <int>
## 1 <NA> 5344
## 2 of the 2217
## 3 in the 1385
## 4 to the 907
## 5 on the 610
## 6 at the 464
## 7 and the 422
## 8 for the 404
## 9 to be 403
## 10 it was 381
## # ... with 97,682 more rows
# Split each bigram into its two words and drop pairs where either half is a
# stop word.
france_bigrams_separated <- france_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
france_bigrams_filtered <- france_bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
france_bigram_counts <- france_bigrams_filtered %>%
count(word1, word2, sort = TRUE)
france_bigrams_united <- france_bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
# Bigram tf-idf per title, then top-5 bigrams per book.
france_bigram_tf_idf <- france_bigrams_united %>%
count(title, bigram) %>%
bind_tf_idf(bigram, title, n) %>%
arrange(desc(tf_idf))
france_bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(title) %>%
top_n(5) %>%
ungroup %>%
ggplot(aes(bigram, tf_idf, fill = title)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~title, ncol = 2, scales = "free") +
coord_flip()
# Network of bigrams occurring at least 15 times.
france_bigram_counts %>%
drop_na() %>%
filter(n >= 15) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "royalblue") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
# Same pipeline for the Dardanelles corpus.
dardanelles_bigrams <- dardanelles_books %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
dardanelles_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 102,363 x 2
## bigram n
## <chr> <int>
## 1 <NA> 6449
## 2 of the 1783
## 3 in the 1143
## 4 to the 775
## 5 and the 671
## 6 on the 659
## 7 it was 627
## 8 in a 404
## 9 from the 401
## 10 at the 390
## # ... with 102,353 more rows
dardanelles_bigrams_separated <- dardanelles_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
dardanelles_bigrams_filtered <- dardanelles_bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
dardanelles_bigram_counts <- dardanelles_bigrams_filtered %>%
count(word1, word2, sort = TRUE)
dardanelles_bigrams_united <- dardanelles_bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
dardanelles_bigram_tf_idf <- dardanelles_bigrams_united %>%
count(title, bigram) %>%
bind_tf_idf(bigram, title, n) %>%
arrange(desc(tf_idf))
dardanelles_bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(title) %>%
top_n(5) %>%
ungroup %>%
ggplot(aes(bigram, tf_idf, fill = title)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~title, ncol = 2, scales = "free") +
coord_flip()
dardanelles_bigram_counts %>%
drop_na() %>%
filter(n >= 15) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "royalblue") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
# Same pipeline for the Russia corpus (higher graph threshold: n >= 20).
russia_bigrams <- russia_books %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
russia_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 109,418 x 2
## bigram n
## <chr> <int>
## 1 <NA> 4220
## 2 of the 2776
## 3 in the 1371
## 4 to the 1081
## 5 on the 878
## 6 and the 691
## 7 it was 505
## 8 for the 497
## 9 by the 493
## 10 had been 444
## # ... with 109,408 more rows
russia_bigrams_separated <- russia_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
russia_bigrams_filtered <- russia_bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
russia_bigram_counts <- russia_bigrams_filtered %>%
count(word1, word2, sort = TRUE)
russia_bigrams_united <- russia_bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
russia_bigram_tf_idf <- russia_bigrams_united %>%
count(title, bigram) %>%
bind_tf_idf(bigram, title, n) %>%
arrange(desc(tf_idf))
russia_bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(title) %>%
top_n(5) %>%
ungroup %>%
ggplot(aes(bigram, tf_idf, fill = title)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~title, ncol = 2, scales = "free") +
coord_flip()
russia_bigram_counts %>%
drop_na() %>%
filter(n >= 20) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "royalblue") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
# Negation analysis (France): bigrams whose first word is "not".
france_bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 430 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not only 52
## 2 not a 50
## 3 not be 46
## 4 not yet 32
## 5 not been 30
## 6 not know 29
## 7 not to 27
## 8 not the 25
## 9 not get 17
## 10 not have 15
## # ... with 420 more rows
# Join the word following "not" against the AFINN lexicon.
# NOTE(review): this references AFINN's `score` column; newer tidytext
# releases name that column `value` -- confirm the installed version before
# re-running.
france_not_words <- france_bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
count(word2, score, sort = TRUE) %>%
ungroup()
# contribution = count * score; top 20 words whose sentiment is most
# misattributed when the preceding "not" is ignored.
france_not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
# Extend the negation analysis to four negation terms: for each (negation,
# following word) pair, count occurrences and join the follower against the
# AFINN lexicon, then plot the 20 largest contributions (count * score).
negation_words <- c("not", "no", "never", "without")
france_negated_words <- france_bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()
france_negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  # BUG FIX: the label previously said "not", copied from the plot above,
  # but this plot covers all four negation words.
  xlab("Words preceded by a negation term") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()
# Negation analysis (Dardanelles): bigrams whose first word is "not".
dardanelles_bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 330 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not a 53
## 2 not to 27
## 3 not the 25
## 4 not only 24
## 5 not be 20
## 6 not so 17
## 7 not know 16
## 8 not been 15
## 9 not yet 15
## 10 not get 14
## # ... with 320 more rows
# NOTE(review): uses AFINN's `score` column (renamed `value` in newer
# tidytext releases) -- confirm the installed version.
dardanelles_not_words <- dardanelles_bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
count(word2, score, sort = TRUE) %>%
ungroup()
# Top 20 contributions (count * score) of words following "not".
dardanelles_not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
# Extend the negation analysis to four negation terms (Dardanelles corpus).
negation_words <- c("not", "no", "never", "without")
dardanelles_negated_words <- dardanelles_bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()
dardanelles_negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  # BUG FIX: the label previously said "not", copied from the plot above,
  # but this plot covers all four negation words.
  xlab("Words preceded by a negation term") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()
# Negation analysis (Russia): bigrams whose first word is "not".
russia_bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 411 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not be 40
## 2 not a 38
## 3 not to 35
## 4 not the 32
## 5 not know 24
## 6 not only 23
## 7 not been 22
## 8 not have 19
## 9 not one 18
## 10 not see 16
## # ... with 401 more rows
# NOTE(review): uses AFINN's `score` column (renamed `value` in newer
# tidytext releases) -- confirm the installed version.
russia_not_words <- russia_bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
count(word2, score, sort = TRUE) %>%
ungroup()
# Top 20 contributions (count * score) of words following "not".
russia_not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
# Extend the negation analysis to four negation terms (Russia corpus).
negation_words <- c("not", "no", "never", "without")
russia_negated_words <- russia_bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()
russia_negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  # BUG FIX: the label previously said "not", copied from the plot above,
  # but this plot covers all four negation words.
  xlab("Words preceded by a negation term") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()
# Directed bigram graphs per region: keep pairs occurring more than 10 times,
# then draw once with plain edges and once with alpha-weighted arrows.
france_bigram_graph <- france_bigram_counts %>%
drop_na() %>%
filter(n > 10) %>%
graph_from_data_frame()
france_bigram_graph
## IGRAPH a402b8f DN-- 79 50 --
## + attr: name (v/c), n (e/n)
## + edges from a402b8f (vertex names):
## [1] red ->cross captain ->andrews degrees ->centigrade
## [4] major ->brighten alsace ->lorraine machine ->gun
## [7] sergeant ->major lancashire->fusiliers front ->line
## [10] herr ->von lance ->corporal sergeant ->baldwin
## [13] french ->army machine ->guns 2 ->5th
## [16] foreign ->affairs french ->soldiers bilge ->trench
## [19] german ->army de ->la rue ->de
## [22] war ->office le ->capitaine military ->governor
## + ... omitted several edges
# Fixed seed so the force-directed ("fr") layout is reproducible.
set.seed(2018)
ggraph(france_bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(france_bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
dardanelles_bigram_graph <- dardanelles_bigram_counts %>%
drop_na() %>%
filter(n > 10) %>%
graph_from_data_frame()
dardanelles_bigram_graph
## IGRAPH a670099 DN-- 78 57 --
## + attr: name (v/c), n (e/n)
## + edges from a670099 (vertex names):
## [1] _postage ->1d firing ->line 6d ->_postage
## [4] 3s ->6d hundred ->yards stretcher->bearers
## [7] _postage ->2d salt ->lake cloth ->gilt
## [10] machine ->gun ken ->answered red ->cross
## [13] suvla ->bay bully ->beef gilt ->gilt
## [16] top ->3s 6s ->_postage dressing ->station
## [19] gilt ->top edges ->6s hospital ->ship
## [22] morocco ->gilt barbed ->wire captain ->carrington
## + ... omitted several edges
set.seed(2018)
ggraph(dardanelles_bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(dardanelles_bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
russia_bigram_graph <- russia_bigram_counts %>%
drop_na() %>%
filter(n > 10) %>%
graph_from_data_frame()
russia_bigram_graph
## IGRAPH 88c7121 DN-- 138 117 --
## + attr: name (v/c), n (e/n)
## + edges from 88c7121 (vertex names):
## [1] north ->russia machine ->gun 339th ->inf
## [4] red ->cross machine ->guns american ->soldiers
## [7] official ->photo north ->russian american ->troops
## [10] ust ->padenga red ->guards admiral ->koltchak
## [13] russian ->people british ->officer russian ->army
## [16] british ->officers commanding->officer dvina ->river
## [19] american ->officer american ->soldier supreme ->governor
## [22] 310th ->engrs
## + ... omitted several edges
set.seed(2018)
ggraph(russia_bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(russia_bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
france_section_words <- france_books %>%
mutate(section = row_number() %/% 10) %>%
filter(section > 0) %>%
unnest_tokens(word, text) %>%
mutate(word=str_extract(word,"[a-z']+")) %>%
filter(!word %in% stop_words$word)
france_section_words
## # A tibble: 83,918 x 5
## gutenberg_id title linenumber section word
## <int> <chr> <int> <dbl> <chr>
## 1 9975 "Paris War Days\nDiary of an A… 11 1 diary
## 2 9975 "Paris War Days\nDiary of an A… 11 1 americ…
## 3 9975 "Paris War Days\nDiary of an A… 19 1 charles
## 4 9975 "Paris War Days\nDiary of an A… 19 1 inman
## 5 9975 "Paris War Days\nDiary of an A… 19 1 barnard
## 6 9975 "Paris War Days\nDiary of an A… 19 1 ll
## 7 9975 "Paris War Days\nDiary of an A… 19 1 harvard
## 8 9975 "Paris War Days\nDiary of an A… 21 2 knight
## 9 9975 "Paris War Days\nDiary of an A… 21 2 legion
## 10 9975 "Paris War Days\nDiary of an A… 21 2 honor
## # ... with 83,908 more rows
france_word_pairs <- france_section_words %>%
pairwise_count(word, section, sort = TRUE) %>%
drop_na()
france_word_pairs
## # A tibble: 2,042,368 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 french german 119
## 2 german french 119
## 3 french war 105
## 4 war french 105
## 5 day war 104
## 6 war day 104
## 7 france war 92
## 8 war france 92
## 9 german war 89
## 10 war german 89
## # ... with 2,042,358 more rows
france_word_cors <- france_section_words %>%
na.omit() %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, section, sort = TRUE)
france_word_cors
## # A tibble: 652,056 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 centigrade degrees 0.906
## 2 degrees centigrade 0.906
## 3 lancashire fusiliers 0.820
## 4 fusiliers lancashire 0.820
## 5 corporal lance 0.806
## 6 lance corporal 0.806
## 7 lorraine alsace 0.773
## 8 alsace lorraine 0.773
## 9 centigrade thermometer 0.704
## 10 thermometer centigrade 0.704
## # ... with 652,046 more rows
france_word_cors %>%
filter(item1 %in% c("lorraine", "lancashire", "fusiliers", "von")) %>%
group_by(item1) %>%
top_n(6) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
set.seed(2018)
france_word_cors %>%
filter(correlation > .25) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
france_word_cors %>%
filter(correlation > .25) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "cyan4") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
dardanelles_section_words <- dardanelles_books %>%
mutate(section = row_number() %/% 10) %>%
filter(section > 0) %>%
unnest_tokens(word, text) %>%
mutate(word=str_extract(word,"[a-z']+")) %>%
filter(!word %in% stop_words$word)
dardanelles_section_words
## # A tibble: 86,828 x 5
## gutenberg_id title linenumber section word
## <int> <chr> <int> <dbl> <chr>
## 1 3306 "At Suvla Bay\r\nBeing the not… 11 1 serving
## 2 3306 "At Suvla Bay\r\nBeing the not… 11 1 field
## 3 3306 "At Suvla Bay\r\nBeing the not… 11 1 ambula…
## 4 3306 "At Suvla Bay\r\nBeing the not… 11 1 divisi…
## 5 3306 "At Suvla Bay\r\nBeing the not… 11 1 medite…
## 6 3306 "At Suvla Bay\r\nBeing the not… 12 1 expedi…
## 7 3306 "At Suvla Bay\r\nBeing the not… 12 1 force
## 8 3306 "At Suvla Bay\r\nBeing the not… 12 1 war
## 9 3306 "At Suvla Bay\r\nBeing the not… 18 1 minobi
## 10 3306 "At Suvla Bay\r\nBeing the not… 20 2 played
## # ... with 86,818 more rows
dardanelles_word_pairs <- dardanelles_section_words %>%
pairwise_count(word, section, sort = TRUE) %>%
drop_na()
dardanelles_word_pairs
## # A tibble: 2,169,572 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 roy ken 215
## 2 ken roy 215
## 3 ken time 83
## 4 time ken 83
## 5 ken answered 80
## 6 answered ken 80
## 7 postage cloth 76
## 8 cloth postage 76
## 9 ken moment 67
## 10 moment ken 67
## # ... with 2,169,562 more rows
# Pairwise phi correlation between words across sections, restricted to
# words that appear at least 20 times so rare words don't dominate.
dardanelles_word_cors <- dardanelles_section_words %>%
  drop_na() %>%                # remove tokens str_extract could not match
  group_by(word) %>%
  filter(n() >= 20) %>%        # keep reasonably frequent words only
  pairwise_cor(word, section, sort = TRUE)
dardanelles_word_cors
## # A tibble: 818,120 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 morocco gilt 0.905
## 2 gilt morocco 0.905
## 3 edges morocco 0.904
## 4 morocco edges 0.904
## 5 lake salt 0.876
## 6 salt lake 0.876
## 7 edges gilt 0.859
## 8 gilt edges 0.859
## 9 beef bully 0.836
## 10 bully beef 0.836
## # ... with 818,110 more rows
# Bar chart of the words most correlated with four hand-picked terms
# from the Dardanelles corpus, one facet per term.
dardanelles_word_cors %>%
  filter(item1 %in% c("edges", "morocco", "jhill", "gilt")) %>%
  group_by(item1) %>%
  top_n(6, correlation) %>%    # name the ranking variable explicitly
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%  # order bars by strength
  ggplot(aes(item2, correlation)) +
  geom_col() +                 # idiomatic form of geom_bar(stat = "identity")
  facet_wrap(~ item1, scales = "free") +
  coord_flip()                 # horizontal bars: word labels stay readable
# Network of strongly-correlated word pairs in the Dardanelles corpus.
set.seed(2018)  # "fr" layout is randomised; seed it for a reproducible plot
dardanelles_word_cors %>%
filter(correlation > .25) %>%   # keep only the stronger word pairs
graph_from_data_frame() %>%     # correlations -> igraph edge list
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
# Styled version of the Dardanelles word-correlation network: same data,
# but with coloured, width-scaled edges and a legend.
dardanelles_word_cors %>%
filter(correlation > .25) %>%   # keep only the stronger word pairs
graph_from_data_frame() %>%
ggraph(layout = "fr") +         # Fruchterman-Reingold force-directed layout
geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "cyan4") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()                    # drop axes/grid: only the network matters
# Break the Russia corpus into 10-line sections and tokenise each section
# into lower-case words, discarding common English stop words.
russia_section_words <- russia_books %>%
  mutate(section = row_number() %/% 10) %>%        # every 10 lines = 1 section
  filter(section > 0) %>%                          # drop the partial section 0
  unnest_tokens(output = word, input = text) %>%   # one row per token
  mutate(word = str_extract(word, "[a-z']+")) %>%  # strip digits/punctuation
  anti_join(stop_words, by = "word")               # same filter style as france_tidy
russia_section_words
## # A tibble: 104,754 x 5
## gutenberg_id title linenumber section word
## <int> <chr> <int> <dbl> <chr>
## 1 10967 Four Weeks in the Trenches: Th… 10 1 life
## 2 10967 Four Weeks in the Trenches: Th… 10 1 dedica…
## 3 10967 Four Weeks in the Trenches: Th… 10 1 book
## 4 10967 Four Weeks in the Trenches: Th… 12 1 humble
## 5 10967 Four Weeks in the Trenches: Th… 12 1 token
## 6 10967 Four Weeks in the Trenches: Th… 12 1 everla…
## 7 10967 Four Weeks in the Trenches: Th… 12 1 gratit…
## 8 10967 Four Weeks in the Trenches: Th… 12 1 devoti…
## 9 10967 Four Weeks in the Trenches: Th… 17 1 preface
## 10 10967 Four Weeks in the Trenches: Th… 19 1 record
## # ... with 104,744 more rows
# Count how often every pair of words co-occurs within the same section.
russia_word_pairs <- russia_section_words %>%
pairwise_count(word, section, sort = TRUE) %>%
drop_na()   # remove pairs built from tokens str_extract could not match
russia_word_pairs
## # A tibble: 2,576,030 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 russian american 167
## 2 american russian 167
## 3 british american 146
## 4 american british 146
## 5 british russian 141
## 6 north russia 141
## 7 russia north 141
## 8 russian british 141
## 9 archangel american 138
## 10 american archangel 138
## # ... with 2,576,020 more rows
# Pairwise phi correlation between words across sections, restricted to
# words that appear at least 20 times so rare words don't dominate.
russia_word_cors <- russia_section_words %>%
  drop_na() %>%                # remove tokens str_extract could not match
  group_by(word) %>%
  filter(n() >= 20) %>%        # keep reasonably frequent words only
  pairwise_cor(word, section, sort = TRUE)
russia_word_cors
## # A tibble: 1,097,256 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 padenga ust 0.910
## 2 ust padenga 0.910
## 3 pvt mich 0.734
## 4 mich pvt 0.734
## 5 photo official 0.724
## 6 official photo 0.724
## 7 photo illustration 0.723
## 8 illustration photo 0.723
## 9 governor supreme 0.686
## 10 supreme governor 0.686
## # ... with 1,097,246 more rows
# Bar chart of the words most correlated with four hand-picked terms
# from the Russia corpus, one facet per term.
russia_word_cors %>%
  filter(item1 %in% c("padenga", "revolutionary", "comrade", "lieut")) %>%
  group_by(item1) %>%
  top_n(6, correlation) %>%    # name the ranking variable explicitly
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%  # order bars by strength
  ggplot(aes(item2, correlation)) +
  geom_col() +                 # idiomatic form of geom_bar(stat = "identity")
  facet_wrap(~ item1, scales = "free") +
  coord_flip()                 # horizontal bars: word labels stay readable
# Network of strongly-correlated word pairs in the Russia corpus.
set.seed(2018)  # "fr" layout is randomised; seed it for a reproducible plot
russia_word_cors %>%
filter(correlation > .25) %>%   # keep only the stronger word pairs
graph_from_data_frame() %>%     # correlations -> igraph edge list
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
# Styled version of the Russia word-correlation network: same data,
# but with coloured, width-scaled edges and a legend.
russia_word_cors %>%
filter(correlation > .25) %>%   # keep only the stronger word pairs
graph_from_data_frame() %>%
ggraph(layout = "fr") +         # Fruchterman-Reingold force-directed layout
geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "cyan4") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()                    # drop axes/grid: only the network matters